Copyright (c) 2015 The Hyve B.V. This notebook is licensed under the GNU General Public License, version 3. Authors: Ruslan Forostianov, Pieter Lukasse. Parts of the R script: Copyright 2014 Janssen Research & Development, LLC, & Copyright (c) 2015 The Hyve B.V., based on demoTransmartRClientCommands.R also made available as GPL v3 in this Jupyter home dir.
Authenticate with tranSMART first if you want to execute any of the analysis in the boxes below again.
Step 1: Please open URL http://localhost:8080/transmart/oauth/authorize?response_type=code&client_id=api-client&client_secret=api-client&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2Ftransmart%2Foauth%2Fverify
Step 2: paste token in the token parameter below.
In [ ]:
require("transmartRClient")
connectToTransmart("http://localhost:8080/transmart", prefetched.request.token = "OjNGZi")
If the output above is: Authentication completed. TRUE , then you can continue below.
In [ ]:
study <- "GSE8581"
# Retrieve Clinical Data
allObservations <- getObservations(study, as.data.frame = T)
In [ ]:
dataDownloaded <- getHighdimData(study.name = study, concept.match = "Lung", projection = "log_intensity")
In [ ]:
summary(dataDownloaded)
In [ ]:
# select gene expression data, which is the data *excluding* columns 1 to 6:
data<-dataDownloaded[["data"]]
expression_data<-data[,-c(1:6)]
expression_data[1:3,1:3]
dim(expression_data)
# add patientId as the row name for the expression_data matrix:
rownames(expression_data)<-data$patientId
expression_data[1:3,1:3]
If the dimensions of the expression_data table are large, you may want to create a subset of the data first. Here we use a probelist as a subset for the probes, based on the list found in: "Bhattacharya S., Srisuma S., Demeo D. L., et al., Molecular biomarkers for quantitative and discrete COPD phenotypes.American Journal of Respiratory Cell and Molecular Biology. 2009;40(3):359–367. doi: 10.1165/rcmb.2008-0114OC."
In [ ]:
#Select a subset of the probes:
probeNames<- c("1552622_s_at","1555318_at","1557293_at","1558280_s_at","1558411_at","1558515_at","1559964_at","204284_at","205051_s_at","205528_s_at","208835_s_at","209377_s_at","209815_at","211548_s_at","212179_at","212263_at","213156_at","213269_at","213650_at","213878_at","215359_x_at","215933_s_at","218352_at","218490_s_at","220094_s_at","220906_at","220925_at","222108_at","224711_at","225318_at","225595_at","225835_at","225892_at","226316_at","226492_at","226534_at","226666_at","226800_at","227095_at","227105_at","227148_at","227812_at","227852_at","227930_at","227947_at","228157_at","228630_at","228665_at","228760_at","228850_s_at","228875_at","228963_at","229111_at","229572_at","230142_s_at","230986_at","232014_at","235423_at","235810_at","238712_at","238992_at","239842_x_at","239847_at","241936_x_at","242389_at")
#note: this is because R automatically prepends "X" in front of column names that start with a numerical value. Therefore prepend "X"
probeNames<- paste("X", probeNames, sep = "")
In [ ]:
# select only the cases and controls (excluding the patients for which the lung disease is not specified). Note: in the observation table the database IDs
# are used to identify the patients and not the patient IDs that are used in the gene expression dataset
cases <- allObservations$observations$subject.id[allObservations$observations$'Subjects_Lung Disease' == "chronic obstructive pulmonary disease"]
controls <- allObservations$observations$subject.id[allObservations$observations$'Subjects_Lung Disease' == "control"]
par(pin=c(5,2))
barplot(c(length(cases),length(controls)), main="Cases and controls", horiz=TRUE,
names.arg=c("cases", "controls"))
In [ ]:
# now we have the *internal database* IDs for the patients, but we need to get the patient IDs because
# this is the index of the expression_data matrix.
# These can be retrieved from the subjectInfo table:
subjectInfo <- allObservations$subjectInfo
patientIDsCase <- subjectInfo$subject.inTrialId[ subjectInfo$subject.id %in% cases ]
patientIDsControl <- subjectInfo$subject.inTrialId[ subjectInfo$subject.id %in% controls]
# patient sets containing case and control patientIDs
patientSets <- c(patientIDsCase, patientIDsControl)
patientSets <- patientSets[which(patientSets %in% rownames(expression_data))]
# make a subset of the data based on the selected patientSets and the probelist, and transpose the
# table so that the rows now contain probe names
subset<-t(expression_data[patientSets,probeNames])
# for ease of recognition: append "Case" and "Control" to the patient names
colnames(subset)[colnames(subset)%in% patientIDsCase] <- paste(colnames(subset)[colnames(subset)%in% patientIDsCase],"Case", sep="_" )
colnames(subset)[colnames(subset)%in% patientIDsControl] <- paste( colnames(subset)[colnames(subset)%in% patientIDsControl] , "Control",sep= "_")
# there is one patient that seems to be an outlier: GSE8581GSM212810_Case.
# remove this outlier and plot heatmap again
subset_without_outlier <- subset[,colnames(subset)!= "GSE8581GSM212810_Case"]
In [ ]:
# PCA analysis :
options(warn=-1)# to turn warnings back on, use : options(warn=0)
subset_t <- t(subset_without_outlier)
prcomp_result <- prcomp(x = subset_t)
result_pca <- prcomp_result$x
#result_pca
rownames_pca_result <- rownames(result_pca)
colors <- c()
for (row in rownames_pca_result){ colors <- c(colors, ifelse(grepl("Case", row), "red", "blue")) }
plot(result_pca[,1], result_pca[,2], col=colors)
legend("topright", c("Case","Control"), pch = 1, col=c("red", "blue"))
In [ ]:
#3D
# from packages("plot3D")
# Enable this line to see documentation:
#?plot3D::scatter3D